﻿// Accord Machine Learning Library
// The Accord.NET Framework
// http://accord-framework.net
//
// Copyright © César Souza, 2009-2017
// cesarsouza at gmail.com
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
//

namespace Accord.MachineLearning
{
    using Accord.Math;
    using System;
    using System.Collections.Generic;
    using System.Threading;
    using Accord.Compat;
    using System.Threading.Tasks;

#if !NET35 && !NET40
    using System.Text.RegularExpressions;
    using System.Collections.ObjectModel;
    using System.Collections.Concurrent;
#else
    using Accord.Collections;
#endif

    /// <summary>
    ///   Weighting schemes for term-frequency (TF).
    /// </summary>
    /// 
    public enum TermFrequency
    {
        /// <summary>
        ///   Binary TF variant (0, 1).
        /// </summary>
        /// 
        Binary,

        /// <summary>
        ///   Raw frequency variant (f_{t,d}).
        /// </summary>
        /// 
        Default,

        /// <summary>
        ///   Log normalization (1 + log(f_{t,d})).
        /// </summary>
        Log,

        /// <summary>
        ///   Double normalization (0.5 + 0.5 { f_{t,d} / max_t'{ f_{t',d} } }
        /// </summary>
        /// 
        DoubleNormalization
    }

    /// <summary>
    ///   Weighting schemes for Inverse Document Frequency (IDF).
    /// </summary>
    /// 
    public enum InverseDocumentFrequency
    {
        /// <summary>
        ///   Unary (1).
        /// </summary>
        /// 
        Unary,

        /// <summary>
        ///   Inverse document frequency, log(N / n_t).
        /// </summary>
        /// 
        Default,

        /// <summary>
        ///   Smooth inverse document frequency, log(N / (1 + n_t)).
        /// </summary>
        /// 
        Smooth,

        /// <summary>
        ///   Max inverse document frequency, log( max_t'{n_t} / n_t).
        /// </summary>
        Max,

        /// <summary>
        ///   Probabilistic inverse document frequency, log((N -n_t) / n_t).
        /// </summary>
        /// 
        Probabilistic
    }

    /// <summary>
    ///   Term Frequency - Inverse Term Frequency.
    /// </summary>
    /// 
    /// <example>
    ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
    /// </example>
    /// 
    [Serializable]
    public class TFIDF : ParallelLearningBase,
        ITransform<string[], double[]>,
        ITransform<string[], Sparse<double>>,
        IUnsupervisedLearning<TFIDF, string[], double[]>
    {
        BagOfWords bow;
        int[] counts;
        int numberOfDocuments;

        TermFrequency tf = TermFrequency.Default;
        InverseDocumentFrequency idf = Accord.MachineLearning.InverseDocumentFrequency.Default;

        double[] inverseDocumentFrequency;

        /// <summary>
        ///   Gets the number of documents that contain each code word. Each element 
        ///   is associated with a word, and the value of the element gives the number 
        ///   of documents that contain this word.
        /// </summary>
        /// 
        public int[] Counts
        {
            get { return counts; }
        }

        /// <summary>
        ///   Gets the total number of documents considered by this TF-IDF.
        /// </summary>
        /// 
        public int NumberOfDocuments
        {
            get { return numberOfDocuments; }
        }

        /// <summary>
        ///   Gets the number of words in this codebook.
        /// </summary>
        /// 
        public int NumberOfWords
        {
            get { return bow.NumberOfWords; }
        }

        /// <summary>
        /// Gets the number of inputs accepted by the model.
        /// </summary>
        /// 
        /// <value>The number of inputs.</value>
        /// 
        public int NumberOfInputs
        {
            get { return bow.NumberOfInputs; }
            set { throw new InvalidOperationException("This property is read-only."); }
        }

        /// <summary>
        /// Gets the number of outputs generated by the model.
        /// </summary>
        /// 
        /// <value>The number of outputs.</value>
        /// 
        public int NumberOfOutputs
        {
            get { return bow.NumberOfOutputs; }
            set { throw new InvalidOperationException("This property is read-only."); }
        }

        /// <summary>
        ///   Gets or sets the inverse document frequency (IDF) definition to be used.
        /// </summary>
        /// 
        public InverseDocumentFrequency Idf
        {
            get { return idf; }
            set { idf = value; }
        }

        /// <summary>
        ///   Gets or sets the term frequency (TF) definition to be used.
        /// </summary>
        /// 
        public TermFrequency Tf
        {
            get { return tf; }
            set { tf = value; }
        }

        /// <summary>
        /// Gets or sets a value indicating whether new words should be added to the
        /// dictionary in the next call to <see cref="Learn(string[][], double[])"/>.
        /// </summary>
        /// 
        public bool UpdateDictionary { get; set; }

        /// <summary>
        ///   Gets the inverse document frequency vector used to scale term-frequency vectors.
        /// </summary>
        /// 
        public double[] InverseDocumentFrequency
        {
            get { return inverseDocumentFrequency; }
        }

        /// <summary>
        ///   Constructs a new <see cref="TFIDF"/>.
        /// </summary>
        /// 
        public TFIDF()
        {
            this.bow = new BagOfWords()
            {
                MaximumOccurance = Int32.MaxValue
            };

            this.UpdateDictionary = true;
        }

        /// <summary>
        ///   Constructs a new <see cref="TFIDF"/>.
        /// </summary>
        /// 
        public TFIDF(params string[][] texts)
            : this()
        {
            bow.Learn(texts);
            UpdateDictionary = false;
        }



        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public TFIDF Learn(string[][] x, double[] weights = null)
        {
            if (weights != null)
                throw new ArgumentException(Accord.Properties.Resources.NotSupportedWeights, "weights");

            if (UpdateDictionary)
                bow.Learn(x);

            IDictionary<string, int> codebook = bow.StringToCode;
            int[] totalCounts = new int[NumberOfWords];

            Parallel.For(0, x.Length, ParallelOptions,

                () => Tuple.Create(new int[NumberOfWords], new bool[NumberOfWords]),

                (i, state, work) =>
                {
                    int[] partialCount = work.Item1;
                    bool[] instanceCount = work.Item2;
                    string[] words = x[i];

                    for (int j = 0; j < words.Length; j++)
                    {
                        int k;
                        if (!codebook.TryGetValue(words[j], out k))
                            continue;

                        instanceCount[k] = true;
                    }

                    for (int j = 0; j < instanceCount.Length; j++)
                        if (instanceCount[j])
                            partialCount[j]++;

                    Array.Clear(instanceCount, 0, instanceCount.Length);
                    return Tuple.Create(partialCount, instanceCount);
                },

                (work) =>
                {
                    int[] partialCount = work.Item1;
                    for (int i = 0; i < totalCounts.Length; i++)
                        Interlocked.Add(ref totalCounts[i], partialCount[i]);
                });

            this.counts = totalCounts;
            this.numberOfDocuments = x.Length;
            this.inverseDocumentFrequency = new double[NumberOfWords];

            switch (idf)
            {
                case Accord.MachineLearning.InverseDocumentFrequency.Unary:
                    for (int i = 0; i < counts.Length; i++)
                        inverseDocumentFrequency[i] = 1;
                    break;
                case Accord.MachineLearning.InverseDocumentFrequency.Default:
                    for (int i = 0; i < counts.Length; i++)
                        if (counts[i] != 0)
                            inverseDocumentFrequency[i] = Math.Log(x.Length / (double)counts[i]);
                    break;
                case Accord.MachineLearning.InverseDocumentFrequency.Smooth:
                    for (int i = 0; i < counts.Length; i++)
                        inverseDocumentFrequency[i] = Math.Log(x.Length / (1.0 + (double)counts[i]));
                    break;
                case Accord.MachineLearning.InverseDocumentFrequency.Max:
                    double max = counts.Max();
                    for (int i = 0; i < counts.Length; i++)
                        inverseDocumentFrequency[i] = Math.Log(max / (1.0 + (double)counts[i]));
                    break;
                case Accord.MachineLearning.InverseDocumentFrequency.Probabilistic:
                    for (int i = 0; i < counts.Length; i++)
                        if (counts[i] != 0)
                            inverseDocumentFrequency[i] = Math.Log((x.Length - counts[i]) / (double)counts[i]);
                    break;
                default:
                    throw new InvalidOperationException("Unknown InverseDocumentFrequency: {0}".Format(idf));
            }

            Accord.Diagnostics.Debug.Assert(!inverseDocumentFrequency.HasNaN());
            Accord.Diagnostics.Debug.Assert(!inverseDocumentFrequency.HasInfinity());

            return this;
        }


        /// <summary>
        /// Applies the transformation to a set of input vectors,
        /// producing an associated set of output vectors.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public double[][] Transform(string[][] input, double[][] result)
        {
            Parallel.For(0, input.Length, ParallelOptions,
                () => (new double[NumberOfWords]),

                (i, state, work) =>
                {
                    Sparse<double> r;
                    Transform(input[i], out r, work);
                    Array.Clear(work, 0, work.Length);
                    r.CopyTo(result[i], 0);
                    return work;
                },

                (work) =>
                {
                });

            return result;
        }

        /// <summary>
        /// Applies the transformation to a set of input vectors,
        /// producing an associated set of output vectors.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public Sparse<double>[] Transform(string[][] input, out Sparse<double>[] result)
        {
            return result = Transform(input, new Sparse<double>[input.Length]);
        }

        /// <summary>
        /// Applies the transformation to a set of input vectors,
        /// producing an associated set of output vectors.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public Sparse<double>[] Transform(string[][] input, Sparse<double>[] result)
        {
            Parallel.For(0, input.Length, ParallelOptions,
                () => new double[NumberOfWords],

                (i, state, work) =>
                {
                    Transform(input[i], out result[i], work);
                    Array.Clear(work, 0, work.Length);
                    return work;
                },

                (work) =>
                {
                });

            return result;
        }

        /// <summary>
        /// Applies the transformation to an input, producing an associated output.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public double[] Transform(string[] input, double[] result)
        {
            Sparse<double> r;
            Transform(input, out r, new double[NumberOfWords]);
            r.CopyTo(result, 0);
            return result;
        }

        /// <summary>
        /// Applies the transformation to an input, producing an associated output.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public Sparse<double> Transform(string[] input, Sparse<double> result)
        {
            return Transform(input, out result, new double[NumberOfWords]);
        }


        /// <summary>
        /// Applies the transformation to an input, producing an associated output.
        /// </summary>
        /// <param name="input">The input data to which the transformation should be applied.</param>
        /// <returns>The output generated by applying this transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public double[] Transform(string[] input)
        {
            return Transform(input, new double[NumberOfWords]);
        }

        /// <summary>
        /// Applies the transformation to a set of input vectors,
        /// producing an associated set of output vectors.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public double[][] Transform(string[][] input)
        {
            return Transform(input, Jagged.Zeros(input.Length, NumberOfWords));
        }

        /// <summary>
        /// Applies the transformation to an input, producing an associated output.
        /// </summary>
        /// <param name="input">The input data to which
        /// the transformation should be applied.</param>
        /// <param name="result">The location to where to store the
        /// result of this transformation.</param>
        /// <returns>The output generated by applying this
        /// transformation to the given input.</returns>
        /// 
        /// <example>
        ///   <para>Examples are available in the main documentation page for 
        ///   <see cref="TFIDF"/>. One of those examples is reproduced below:</para>
        ///   <code source="Unit Tests\Accord.Tests.MachineLearning\TFIDFTest.cs" region="doc_learn"/>
        /// </example>
        /// 
        public Sparse<double> Transform(string[] input, out Sparse<double> result)
        {
            return Transform(input, out result, new double[NumberOfWords]);
        }






        Sparse<double> ICovariantTransform<string[], Sparse<double>>.Transform(string[] input)
        {
            Sparse<double> r;
            return Transform(input, out r);
        }

        Sparse<double>[] ICovariantTransform<string[], Sparse<double>>.Transform(string[][] input)
        {
            return Transform(input, new Sparse<double>[input.Length]);
        }



        private Sparse<double> Transform(string[] x, out Sparse<double> sparse, double[] work)
        {
            IDictionary<string, int> codebook = bow.StringToCode;

            for (int j = 0; j < x.Length; j++)
            {
                int k;
                if (!codebook.TryGetValue(x[j], out k))
                    continue;

                work[k]++;
            }

            sparse = Sparse.FromDense(work);

            switch (tf)
            {
                case TermFrequency.Binary:
                    for (int j = 0; j < sparse.Values.Length; j++)
                        sparse.Values[j] = sparse.Values[j] = 1;
                    break;
                case TermFrequency.Default:
                    break;
                case TermFrequency.Log:
                    for (int j = 0; j < sparse.Values.Length; j++)
                        sparse.Values[j] = 1 + Math.Log(sparse.Values[j]);
                    break;
                case TermFrequency.DoubleNormalization:
                    double max = sparse.Values.Max();
                    for (int j = 0; j < sparse.Values.Length; j++)
                        sparse.Values[j] = 0.5 + 0.5 * (sparse.Values[j] / max);
                    break;
                default:
                    throw new InvalidOperationException("Unknown TermFrequency: {0}".Format(tf));
            }

            // Divide by the inverse document frequency
            for (int j = 0; j < sparse.Values.Length; j++)
            {
                double a = sparse.Values[j];
                int k = sparse.Indices[j];
                double v = a * inverseDocumentFrequency[k];

#if DEBUG
                if (Double.IsNaN(v) || Double.IsInfinity(v))
                    throw new Exception();
#endif
                sparse.Values[j] = v;
            }

            return sparse;
        }
    }
}
